library(tidyverse)
── Attaching core tidyverse packages ─────────────────────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.0── Conflicts ───────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks plotly::filter(), stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
# read data
data <- read.csv('data/2018_Data.csv')
print(data)
View(data)
# Read required columns file (column)
column <- read.csv('data/2018_MetaData.csv')

# select the required columns from the total data
main_df <- select(data, column$Column_Name)
print(main_df)
main_df <- main_df[-c(1),]
rownames(main_df) <- 1:nrow(main_df)
print(main_df)
main_df <- main_df %>% 
           rename_at(vars(GEO_ID:DP05_0089E), ~ column$Label_Shortcut) %>% 
           separate_wider_delim(NAME, ",", names = c('COUNTY', 'STATE'))
           
main_df$COUNTY <- gsub(" County", "", as.character(main_df$COUNTY))
print(main_df)
# Check Null values
sum(is.null(main_df))
[1] 0
sapply(main_df, class)
                                          GEO_ID                                           COUNTY 
                                     "character"                                      "character" 
                                           STATE                              SA_TOTAL_POPULATION 
                                     "character"                                      "character" 
                                      SA_TP_MALE                                     SA_TP_FEMALE 
                                     "character"                                      "character" 
                                 SA_TP_SEX_RATIO                                    SA_TP_UNDER_5 
                                     "character"                                      "character" 
                                    SA_TP_5_TO_9                                   SA_TP_10_TO_14 
                                     "character"                                      "character" 
                                  SA_TP_15_TO_19                                   SA_TP_20_TO_24 
                                     "character"                                      "character" 
                                  SA_TP_25_TO_34                                   SA_TP_35_TO_44 
                                     "character"                                      "character" 
                                  SA_TP_45_TO_54                                   SA_TP_55_TO_59 
                                     "character"                                      "character" 
                                  SA_TP_60_TO_64                                   SA_TP_65_TO_74 
                                     "character"                                      "character" 
                                  SA_TP_75_TO_84                                   SA_TP_85_ABOVE 
                                     "character"                                      "character" 
                                SA_TP_MEDIAN_AGE                                   SA_TP_UNDER_18 
                                     "character"                                      "character" 
                                  SA_TP_16_ABOVE                                   SA_TP_18_ABOVE 
                                     "character"                                      "character" 
                                  SA_TP_21_ABOVE                                   SA_TP_62_ABOVE 
                                     "character"                                      "character" 
                                  SA_TP_65_ABOVE                              SA_TP_18_ABOVE_MALE 
                                     "character"                                      "character" 
                           SA_TP_18_ABOVE_FEMALE                         SA_TP_18_ABOVE_SEX_RATIO 
                                     "character"                                      "character" 
                             SA_TP_65_ABOVE_MALE                            SA_TP_65_ABOVE_FEMALE 
                                     "character"                                      "character" 
                        SA_TP_65_ABOVE_SEX_RATIO                                   RACE_TOTAL_POP 
                                     "character"                                      "character" 
                                  RACE_TP_1_RACE                          RACE_TP_2_OR_MORE_RACES 
                                     "character"                                      "character" 
                                 RACE_TP_1_WHITE                           RACE_TP_1_BLK_AFR_AMER 
                                     "character"                                      "character" 
                     RACE_TP_1_AMER_IND_ALAS_NAT                          RACE_TP_1_AIAN_CHEROKEE 
                                     "character"                                      "character" 
                         RACE_TP_1_AIAN_CHIPPEWA                                 RACE_TP_1_NAVAJO 
                                     "character"                                      "character" 
                                 RACE_TP_1_SIOUX                                  RACE_TP_1_ASIAN 
                                     "character"                                      "character" 
                          RACE_TP_1_ASIAN_INDIAN                          RACE_TP_1_ASIAN_CHINESE 
                                     "character"                                      "character" 
                        RACE_TP_1_ASIAN_JAPANESE                         RACE_TP_1_ASIAN_FILIPINO 
                                     "character"                                      "character" 
                          RACE_TP_1_ASIAN_KOREAN                       RACE_TP_1_ASIAN_VIETNAMESE 
                                     "character"                                      "character" 
                           RACE_TP_1_ASIAN_OTHER                    RACE_TP_1_NAT_HAW_OTH_PAC_ISD 
                                     "character"                                      "character" 
                    RACE_TP_1_NHOPI_NAT_HAWAIIAN               RACE_TP_1_NHOPI_GUAMANIAN_CHAMORRO 
                                     "character"                                      "character" 
                          RACE_TP_1_NHOPI_SAMOAN                      RACE_TP_1_NHOPI_OTH_PAC_ISD 
                                     "character"                                      "character" 
                         RACE_TP_1_SOME_OTH_RACE               RACE_TP_2_MORE_WHITE_BLK_AFR_AMERT 
                                     "character"                                      "character" 
          RACE_TP_2_MORE_WHITE_AMER_IND_ALAS_NAT                       RACE_TP_2_MORE_WHITE_ASIAN 
                                     "character"                                      "character" 
RACE_TP_2_MORE_BLK_AFR_AMER_AMER_INDIAN_ALAS_NAT                    RACE_ALONE_1_2_MORE_TOTAL_POP 
                                     "character"                                      "character" 
                    RACE_ALONE_1_2_MORE_TP_WHITE              RACE_ALONE_1_2_MORE_TP_BLK_AFR_AMER 
                                     "character"                                      "character" 
        RACE_ALONE_1_2_MORE_TP_AMER_IND_ALAS_NAT                     RACE_ALONE_1_2_MORE_TP_ASIAN 
                                     "character"                                      "character" 
      RACE_ALONE_1_2_MORE_TP_NAT_HAW_OTH_PAC_ISD             RACE_ALONE_1_2_MORE_TP_SOME_OTH_RACE 
                                     "character"                                      "character" 
                  HISPANIC_LATINO_RACE_TOTAL_POP                          HL_RACE_HISPANIC_LATINO 
                                     "character"                                      "character" 
                                 HL_RACE_MEXICAN                             HL_RACE_PUERTO_RICAN 
                                     "character"                                      "character" 
                                   HL_RACE_CUBAN                    HL_RACE_OTHER_HISPANIC_LATINO 
                                     "character"                                      "character" 
                HL_NOT_HISPANIC_LATINO_TOTAL_POP                          HL_NOT_HILA_WHITE_ALONE 
                                     "character"                                      "character" 
                        HL_NOT_HILA_BLK_AFR_AMER              HL_NOT_HILA_AMER_IND_ALAS_NAT_ALONE 
                                     "character"                                      "character" 
                         HL_NOT_HILA_ASIAN_ALONE            HL_NOT_HILA_NAT_HAW_OTH_PAC_ISD_ALONE 
                                     "character"                                      "character" 
                 HL_NOT_HILA_SOME_OTH_RACE_ALONE                          HL_NOT_HILA_2_MORE_RACE 
                                     "character"                                      "character" 
            HL_NOT_HILA_2_MORE_INC_SOME_OTH_RACE    HL_NOT_HILA_2_MORE_EXC_SOME_OTHE_RACE_3_ MORE 
                                     "character"                                      "character" 
                             TOTAL_HOUSING_UNITS                    CITIZEN_VOTE_AGE_POP_18_ABOVE 
                                     "character"                                      "character" 
              CITIZEN_VOTE_AGE_POP_18_ABOVE_MALE             CITIZEN_VOTE_AGE_POP_18_ABOVE_FEMALE 
                                     "character"                                      "character" 
# Function to convert from character to integer without error handling
convert_to_integer <- function(x) {
  as.integer(ifelse(grepl("^-?\\d+$", x), x, NA))
}

# Function to convert from character to numeric without error handling
convert_to_numeric <- function(x) {
  as.numeric(ifelse(grepl("^-?\\d+\\.?\\d*$", x), x, NA))
}

# Columns to convert to float
float_columns <- c("SA_TP_SEX_RATIO", 
                   "SA_TP_MEDIAN_AGE", 
                   "SA_TP_18_ABOVE_SEX_RATIO", 
                   "SA_TP_65_ABOVE_SEX_RATIO")

character_columns <- c("GEO_ID", "COUNTY", "STATE")

# Columns to convert to integer
integer_columns <- setdiff(names(main_df), c(float_columns, character_columns))

main_df <- main_df %>%
  mutate(across(all_of(float_columns), convert_to_numeric),
         across(all_of(integer_columns), convert_to_integer))
print(main_df)
state <- unique(main_df$STATE)
df_state <- data.frame(state)
print(df_state)

STATE - COUNTY - FIPS

df_fips <- read.csv("data/COUNTY-FIPS.csv")
df_fips

# add leading zeros
df_fips <- data.frame(distinct(df_fips))
df_fips$FIPS_CODE <- sprintf("%05d", df_fips$FIPS_CODE)
print(df_fips)
df_state_abbr <- read.csv("data/State-Abbr.csv")
df_state_abbr
df_fips_state <- merge(df_fips, df_state_abbr, by = c("STATE","STATE_ABBR"))
df_fips_state
df_fips_state <- df_fips_state[order(df_fips_state$STATE, df_fips_state$COUNTY), ]
rownames(df_fips_state) <- 1:nrow(df_fips_state)
df_fips_state$COUNTY <- gsub(" County", "", as.character(df_fips_state$COUNTY))
df_fips_state
# write.csv(df_fips_state, "data/County-Fips.csv", row.names = FALSE)
# Trim Preceding or succeeding spaces
main_df$STATE <- trimws(main_df$STATE)

df <- merge(main_df, df_fips_state, by = c("COUNTY", "STATE"))
df
print(sum(is.na(df)))
[1] 29
# Get null values in dataframe
df[!complete.cases(df), ]
df$SA_TP_65_ABOVE_SEX_RATIO[is.na(df$SA_TP_65_ABOVE_SEX_RATIO)] <- 0
print(sum(is.na(df)))
[1] 0
df <- df[order(df$STATE, df$COUNTY), ]
rownames(df) <- 1:nrow(df)
df

# write.csv(df, "data/Cleaned_df.csv", row.names = FALSE )
# Check the uncompatible lines

# missing_combinations <- anti_join(main_df, df_fips_state, by = c("COUNTY", "STATE"))
# missing_combinations
LS0tDQp0aXRsZTogIkRhdGEgQ2xlYW5pbmciDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQoNCmBgYHtyfQ0KbGlicmFyeSh0aWR5dmVyc2UpDQpgYGANCg0KYGBge3J9DQojIHJlYWQgZGF0YQ0KZGF0YSA8LSByZWFkLmNzdignZGF0YS8yMDE4X0RhdGEuY3N2JykNCnByaW50KGRhdGEpDQpgYGANCg0KYGBge3J9DQojIFJlYWQgcmVxdWlyZWQgY29sdW1ucyBmaWxlIChjb2x1bW4pDQpjb2x1bW4gPC0gcmVhZC5jc3YoJ2RhdGEvMjAxOF9NZXRhRGF0YS5jc3YnKQ0KDQojIHNlbGVjdCB0aGUgcmVxdWlyZWQgY29sdW1ucyBmcm9tIHRoZSB0b3RhbCBkYXRhDQptYWluX2RmIDwtIHNlbGVjdChkYXRhLCBjb2x1bW4kQ29sdW1uX05hbWUpDQpwcmludChtYWluX2RmKQ0KYGBgDQoNCmBgYHtyfQ0KbWFpbl9kZiA8LSBtYWluX2RmWy1jKDEpLF0NCmBgYA0KDQpgYGB7cn0NCnJvd25hbWVzKG1haW5fZGYpIDwtIDE6bnJvdyhtYWluX2RmKQ0KYGBgDQoNCmBgYHtyfQ0KcHJpbnQobWFpbl9kZikNCmBgYA0KDQoNCg0KYGBge3J9DQptYWluX2RmIDwtIG1haW5fZGYgJT4lIA0KICAgICAgICAgICByZW5hbWVfYXQodmFycyhHRU9fSUQ6RFAwNV8wMDg5RSksIH4gY29sdW1uJExhYmVsX1Nob3J0Y3V0KSAlPiUgDQogICAgICAgICAgIHNlcGFyYXRlX3dpZGVyX2RlbGltKE5BTUUsICIsIiwgbmFtZXMgPSBjKCdDT1VOVFknLCAnU1RBVEUnKSkNCiAgICAgICAgICAgDQpgYGANCg0KYGBge3J9DQptYWluX2RmJENPVU5UWSA8LSBnc3ViKCIgQ291bnR5IiwgIiIsIGFzLmNoYXJhY3RlcihtYWluX2RmJENPVU5UWSkpDQpgYGANCg0KDQpgYGB7cn0NCnByaW50KG1haW5fZGYpDQpgYGANCg0KYGBge3J9DQojIENoZWNrIE51bGwgdmFsdWVzDQpzdW0oaXMubnVsbChtYWluX2RmKSkNCmBgYA0KDQpgYGB7cn0NCnNhcHBseShtYWluX2RmLCBjbGFzcykNCmBgYA0KDQpgYGB7cn0NCiMgRnVuY3Rpb24gdG8gY29udmVydCBmcm9tIGNoYXJhY3RlciB0byBpbnRlZ2VyIHdpdGhvdXQgZXJyb3IgaGFuZGxpbmcNCmNvbnZlcnRfdG9faW50ZWdlciA8LSBmdW5jdGlvbih4KSB7DQogIGFzLmludGVnZXIoaWZlbHNlKGdyZXBsKCJeLT9cXGQrJCIsIHgpLCB4LCBOQSkpDQp9DQoNCiMgRnVuY3Rpb24gdG8gY29udmVydCBmcm9tIGNoYXJhY3RlciB0byBudW1lcmljIHdpdGhvdXQgZXJyb3IgaGFuZGxpbmcNCmNvbnZlcnRfdG9fbnVtZXJpYyA8LSBmdW5jdGlvbih4KSB7DQogIGFzLm51bWVyaWMoaWZlbHNlKGdyZXBsKCJeLT9cXGQrXFwuP1xcZCokIiwgeCksIHgsIE5BKSkNCn0NCg0KIyBDb2x1bW5zIHRvIGNvbnZlcnQgdG8gZmxvYXQNCmZsb2F0X2NvbHVtbnMgPC0gYygiU0FfVFBfU0VYX1JBVElPIiwgDQogICAgICAgICAgICAgICAgICAgIlNBX1RQX01FRElBTl9BR0UiLCANCiAgICAgICAgICAgICAgICAgICAiU0FfVFBfMThfQUJPVkVfU0VYX1JBVElPIiwgDQogICAgICAgICAgICAgICAgICAgIlNBX1RQXzY1X0FCT1ZFX1NFWF9SQVRJTyIpDQoNCmNoYXJhY3Rlcl9jb2x1bW5zIDwtIGMoIkdFT19JRCIsICJDT1VOVFkiLCAiU1RBVEUiKQ0KDQojIENvbHVtbnMgdG8gY29udmVydCB0byBpbnRlZ2VyDQppbnRlZ2VyX2NvbHVtbnMgPC0gc2V0ZGlmZihuYW1lcyhtYWluX2RmKSwgYyhmbG9hdF9jb2x1bW5zLCBjaGFyYWN0ZXJfY29sdW1ucykpDQoNCm1haW5fZGYgPC0gbWFpbl9kZiAlPiUNCiAgbXV0YXRlKGFjcm9zcyhhbGxfb2YoZmxvYXRfY29sdW1ucyksIGNvbnZlcnRfdG9fbnVtZXJpYyksDQogICAgICAgICBhY3Jvc3MoYWxsX29mKGludGVnZXJfY29sdW1ucyksIGNvbnZlcnRfdG9faW50ZWdlcikpDQpgYGANCg0KYGBge3J9DQpwcmludChtYWluX2RmKQ0KYGBgDQoNCmBgYHtyfQ0Kc3RhdGUgPC0gdW5pcXVlKG1haW5fZGYkU1RBVEUpDQpkZl9zdGF0ZSA8LSBkYXRhLmZyYW1lKHN0YXRlKQ0KcHJpbnQoZGZfc3RhdGUpDQpgYGANCg0KDQojIyMgU1RBVEUgLSBDT1VOVFkgLSBGSVBTDQoNCg0KYGBge3J9DQpkZl9maXBzIDwtIHJlYWQuY3N2KCJkYXRhL0NPVU5UWS1GSVBTLmNzdiIpDQpkZl9maXBzDQpgYGANCg0KDQpgYGB7cn0NCg0KIyBhZGQgbGVhZGluZyB6ZXJvcw0KZGZfZmlwcyA8LSBkYXRhLmZyYW1lKGRpc3RpbmN0KGRmX2ZpcHMpKQ0KZGZfZmlwcyRGSVBTX0NPREUgPC0gc3ByaW50ZigiJTA1ZCIsIGRmX2ZpcHMkRklQU19DT0RFKQ0KcHJpbnQoZGZfZmlwcykNCmBgYA0KDQpgYGB7cn0NCmRmX3N0YXRlX2FiYnIgPC0gcmVhZC5jc3YoImRhdGEvU3RhdGUtQWJici5jc3YiKQ0KZGZfc3RhdGVfYWJicg0KYGBgDQoNCmBgYHtyfQ0KZGZfZmlwc19zdGF0ZSA8LSBtZXJnZShkZl9maXBzLCBkZl9zdGF0ZV9hYmJyLCBieSA9IGMoIlNUQVRFIiwiU1RBVEVfQUJCUiIpKQ0KZGZfZmlwc19zdGF0ZQ0KYGBgDQoNCmBgYHtyfQ0KZGZfZmlwc19zdGF0ZSA8LSBkZl9maXBzX3N0YXRlW29yZGVyKGRmX2ZpcHNfc3RhdGUkU1RBVEUsIGRmX2ZpcHNfc3RhdGUkQ09VTlRZKSwgXQ0Kcm93bmFtZXMoZGZfZmlwc19zdGF0ZSkgPC0gMTpucm93KGRmX2ZpcHNfc3RhdGUpDQpkZl9maXBzX3N0YXRlJENPVU5UWSA8LSBnc3ViKCIgQ291bnR5IiwgIiIsIGFzLmNoYXJhY3RlcihkZl9maXBzX3N0YXRlJENPVU5UWSkpDQpkZl9maXBzX3N0YXRlDQpgYGANCg0KDQpgYGB7cn0NCiMgd3JpdGUuY3N2KGRmX2ZpcHNfc3RhdGUsICJkYXRhL0NvdW50eS1GaXBzLmNzdiIsIHJvdy5uYW1lcyA9IEZBTFNFKQ0KYGBgDQoNCmBgYHtyfQ0KIyBUcmltIFByZWNlZGluZyBvciBzdWNjZWVkaW5nIHNwYWNlcw0KbWFpbl9kZiRTVEFURSA8LSB0cmltd3MobWFpbl9kZiRTVEFURSkNCg0KZGYgPC0gbWVyZ2UobWFpbl9kZiwgZGZfZmlwc19zdGF0ZSwgYnkgPSBjKCJDT1VOVFkiLCAiU1RBVEUiKSkNCmRmDQpgYGANCg0KYGBge3J9DQpwcmludChzdW0oaXMubmEoZGYpKSkNCg0KIyBHZXQgbnVsbCB2YWx1ZXMgaW4gZGF0YWZyYW1lDQpkZlshY29tcGxldGUuY2FzZXMoZGYpLCBdDQpgYGANCg0KYGBge3J9DQpkZiRTQV9UUF82NV9BQk9WRV9TRVhfUkFUSU9baXMubmEoZGYkU0FfVFBfNjVfQUJPVkVfU0VYX1JBVElPKV0gPC0gMA0KcHJpbnQoc3VtKGlzLm5hKGRmKSkpDQpgYGANCg0KYGBge3J9DQpkZiA8LSBkZltvcmRlcihkZiRTVEFURSwgZGYkQ09VTlRZKSwgXQ0Kcm93bmFtZXMoZGYpIDwtIDE6bnJvdyhkZikNCmRmDQpgYGANCg0KJw0KYGBge3J9DQojIHdyaXRlLmNzdihkZiwgImRhdGEvQ2xlYW5lZF9kZi5jc3YiLCByb3cubmFtZXMgPSBGQUxTRSApDQpgYGANCg0KYGBge3J9DQojIENoZWNrIHRoZSB1bmNvbXBhdGlibGUgbGluZXMNCg0KIyBtaXNzaW5nX2NvbWJpbmF0aW9ucyA8LSBhbnRpX2pvaW4obWFpbl9kZiwgZGZfZmlwc19zdGF0ZSwgYnkgPSBjKCJDT1VOVFkiLCAiU1RBVEUiKSkNCiMgbWlzc2luZ19jb21iaW5hdGlvbnMNCmBgYA0KDQpgYGB7cn0NCg0KYGBgDQoNCg0KDQoNCg0K